{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n",
      "  return f(*args, **kwds)\n",
      "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You have TensorFlow version 1.4.0\n"
     ]
    }
   ],
   "source": [
    "import itertools\n",
    "import os\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "\n",
    "from sklearn.preprocessing import LabelBinarizer, LabelEncoder\n",
    "from sklearn.metrics import confusion_matrix\n",
    "\n",
    "from tensorflow import keras\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Activation, Dropout\n",
    "from keras.preprocessing import text, sequence\n",
    "from keras import utils\n",
    "\n",
    "# This code was tested with TensorFlow v1.4\n",
    "print(\"You have TensorFlow version\", tf.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date received</th>\n",
       "      <th>Product</th>\n",
       "      <th>Sub-product</th>\n",
       "      <th>Issue</th>\n",
       "      <th>Sub-issue</th>\n",
       "      <th>Consumer complaint narrative</th>\n",
       "      <th>Company public response</th>\n",
       "      <th>Company</th>\n",
       "      <th>State</th>\n",
       "      <th>ZIP code</th>\n",
       "      <th>Tags</th>\n",
       "      <th>Consumer consent provided?</th>\n",
       "      <th>Submitted via</th>\n",
       "      <th>Date sent to company</th>\n",
       "      <th>Company response to consumer</th>\n",
       "      <th>Timely response?</th>\n",
       "      <th>Consumer disputed?</th>\n",
       "      <th>Complaint ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>03/12/2014</td>\n",
       "      <td>Mortgage</td>\n",
       "      <td>Other mortgage</td>\n",
       "      <td>Loan modification,collection,foreclosure</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>M&amp;T BANK CORPORATION</td>\n",
       "      <td>MI</td>\n",
       "      <td>48382</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Referral</td>\n",
       "      <td>03/17/2014</td>\n",
       "      <td>Closed with explanation</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>759217.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10/01/2016</td>\n",
       "      <td>Credit reporting</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Incorrect information on credit report</td>\n",
       "      <td>Account status</td>\n",
       "      <td>I have outdated information on my credit repor...</td>\n",
       "      <td>Company has responded to the consumer and the ...</td>\n",
       "      <td>TRANSUNION INTERMEDIATE HOLDINGS, INC.</td>\n",
       "      <td>AL</td>\n",
       "      <td>352XX</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Consent provided</td>\n",
       "      <td>Web</td>\n",
       "      <td>10/05/2016</td>\n",
       "      <td>Closed with explanation</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>2141773.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10/17/2016</td>\n",
       "      <td>Consumer Loan</td>\n",
       "      <td>Vehicle loan</td>\n",
       "      <td>Managing the loan or lease</td>\n",
       "      <td>NaN</td>\n",
       "      <td>I purchased a new car on XXXX XXXX. The car de...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CITIZENS FINANCIAL GROUP, INC.</td>\n",
       "      <td>PA</td>\n",
       "      <td>177XX</td>\n",
       "      <td>Older American</td>\n",
       "      <td>Consent provided</td>\n",
       "      <td>Web</td>\n",
       "      <td>10/20/2016</td>\n",
       "      <td>Closed with explanation</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>2163100.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>06/08/2014</td>\n",
       "      <td>Credit card</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Bankruptcy</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AMERICAN EXPRESS COMPANY</td>\n",
       "      <td>ID</td>\n",
       "      <td>83854</td>\n",
       "      <td>Older American</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Web</td>\n",
       "      <td>06/10/2014</td>\n",
       "      <td>Closed with explanation</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Yes</td>\n",
       "      <td>885638.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>09/13/2014</td>\n",
       "      <td>Debt collection</td>\n",
       "      <td>Credit card</td>\n",
       "      <td>Communication tactics</td>\n",
       "      <td>Frequent or repeated calls</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CITIBANK, N.A.</td>\n",
       "      <td>VA</td>\n",
       "      <td>23233</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Web</td>\n",
       "      <td>09/13/2014</td>\n",
       "      <td>Closed with explanation</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Yes</td>\n",
       "      <td>1027760.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Date received           Product     Sub-product  \\\n",
       "0    03/12/2014          Mortgage  Other mortgage   \n",
       "1    10/01/2016  Credit reporting             NaN   \n",
       "2    10/17/2016     Consumer Loan    Vehicle loan   \n",
       "3    06/08/2014       Credit card             NaN   \n",
       "4    09/13/2014   Debt collection     Credit card   \n",
       "\n",
       "                                      Issue                   Sub-issue  \\\n",
       "0  Loan modification,collection,foreclosure                         NaN   \n",
       "1    Incorrect information on credit report              Account status   \n",
       "2                Managing the loan or lease                         NaN   \n",
       "3                                Bankruptcy                         NaN   \n",
       "4                     Communication tactics  Frequent or repeated calls   \n",
       "\n",
       "                        Consumer complaint narrative  \\\n",
       "0                                                NaN   \n",
       "1  I have outdated information on my credit repor...   \n",
       "2  I purchased a new car on XXXX XXXX. The car de...   \n",
       "3                                                NaN   \n",
       "4                                                NaN   \n",
       "\n",
       "                             Company public response  \\\n",
       "0                                                NaN   \n",
       "1  Company has responded to the consumer and the ...   \n",
       "2                                                NaN   \n",
       "3                                                NaN   \n",
       "4                                                NaN   \n",
       "\n",
       "                                  Company State ZIP code            Tags  \\\n",
       "0                    M&T BANK CORPORATION    MI    48382             NaN   \n",
       "1  TRANSUNION INTERMEDIATE HOLDINGS, INC.    AL    352XX             NaN   \n",
       "2          CITIZENS FINANCIAL GROUP, INC.    PA    177XX  Older American   \n",
       "3                AMERICAN EXPRESS COMPANY    ID    83854  Older American   \n",
       "4                          CITIBANK, N.A.    VA    23233             NaN   \n",
       "\n",
       "  Consumer consent provided? Submitted via Date sent to company  \\\n",
       "0                        NaN      Referral           03/17/2014   \n",
       "1           Consent provided           Web           10/05/2016   \n",
       "2           Consent provided           Web           10/20/2016   \n",
       "3                        NaN           Web           06/10/2014   \n",
       "4                        NaN           Web           09/13/2014   \n",
       "\n",
       "  Company response to consumer Timely response? Consumer disputed?  \\\n",
       "0      Closed with explanation              Yes                 No   \n",
       "1      Closed with explanation              Yes                 No   \n",
       "2      Closed with explanation              Yes                 No   \n",
       "3      Closed with explanation              Yes                Yes   \n",
       "4      Closed with explanation              Yes                Yes   \n",
       "\n",
       "   Complaint ID  \n",
       "0      759217.0  \n",
       "1     2141773.0  \n",
       "2     2163100.0  \n",
       "3      885638.0  \n",
       "4     1027760.0  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('Consumer_Complaints.csv', encoding='latin-1')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Consumer complaint narrative</th>\n",
       "      <th>Product</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>I have outdated information on my credit repor...</td>\n",
       "      <td>Credit reporting</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>I purchased a new car on XXXX XXXX. The car de...</td>\n",
       "      <td>Consumer Loan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>An account on my credit report has a mistaken ...</td>\n",
       "      <td>Credit reporting</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>This company refuses to provide me verificatio...</td>\n",
       "      <td>Debt collection</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>This complaint is in regards to Square Two Fin...</td>\n",
       "      <td>Debt collection</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         Consumer complaint narrative           Product\n",
       "1   I have outdated information on my credit repor...  Credit reporting\n",
       "2   I purchased a new car on XXXX XXXX. The car de...     Consumer Loan\n",
       "7   An account on my credit report has a mistaken ...  Credit reporting\n",
       "12  This company refuses to provide me verificatio...   Debt collection\n",
       "16  This complaint is in regards to Square Two Fin...   Debt collection"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "col = ['Consumer complaint narrative', 'Product']\n",
    "df = df[col]\n",
    "df = df[pd.notnull(df['Consumer complaint narrative'])]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Consumer complaint narrative    0\n",
       "Product                         0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Debt collection                                                                 2575\n",
       "Mortgage                                                                        2121\n",
       "Credit reporting                                                                2049\n",
       "Credit card                                                                     1237\n",
       "Bank account or service                                                          950\n",
       "Student loan                                                                     770\n",
       "Consumer Loan                                                                    607\n",
       "Payday loan                                                                      111\n",
       "Money transfers                                                                  104\n",
       "Prepaid card                                                                      90\n",
       "Other financial service                                                           26\n",
       "Virtual currency                                                                   2\n",
       "Credit reporting, credit repair services, or other personal consumer reports       1\n",
       "Name: Product, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Product'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 8514\n",
      "Test size: 2129\n"
     ]
    }
   ],
   "source": [
    "# Split data into train and test\n",
    "train_size = int(len(df) * .8)\n",
    "print (\"Train size: %d\" % train_size)\n",
    "print (\"Test size: %d\" % (len(df) - train_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_narrative = df['Consumer complaint narrative'][:train_size]\n",
    "train_product = df['Product'][:train_size]\n",
    "\n",
    "test_narrative = df['Consumer complaint narrative'][train_size:]\n",
    "test_product = df['Product'][train_size:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_words = 1000\n",
    "tokenize = text.Tokenizer(num_words=max_words, char_level=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenize.fit_on_texts(train_narrative) # only fit on train\n",
    "x_train = tokenize.texts_to_matrix(train_narrative)\n",
    "x_test = tokenize.texts_to_matrix(test_narrative)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use sklearn utility to convert label strings to numbered index\n",
    "encoder = LabelEncoder()\n",
    "encoder.fit(train_product)\n",
    "y_train = encoder.transform(train_product)\n",
    "y_test = encoder.transform(test_product)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Converts the labels to a one-hot representation\n",
    "num_classes = np.max(y_train) + 1\n",
    "y_train = utils.to_categorical(y_train, num_classes)\n",
    "y_test = utils.to_categorical(y_test, num_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x_train shape: (8514, 1000)\n",
      "x_test shape: (2129, 1000)\n",
      "y_train shape: (8514, 13)\n",
      "y_test shape: (2129, 13)\n"
     ]
    }
   ],
   "source": [
    "# Inspect the dimenstions of our training and test data (this is helpful to debug)\n",
    "print('x_train shape:', x_train.shape)\n",
    "print('x_test shape:', x_test.shape)\n",
    "print('y_train shape:', y_train.shape)\n",
    "print('y_test shape:', y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 32\n",
    "epochs = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the model\n",
    "model = Sequential()\n",
    "model.add(Dense(512, input_shape=(max_words,)))\n",
    "model.add(Activation('relu'))\n",
    "model.add(Dropout(0.5))\n",
    "model.add(Dense(num_classes))\n",
    "model.add(Activation('softmax'))\n",
    "\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 7662 samples, validate on 852 samples\n",
      "Epoch 1/5\n",
      "7662/7662 [==============================] - 6s 754us/step - loss: 1.0964 - acc: 0.6784 - val_loss: 0.7574 - val_acc: 0.7864\n",
      "Epoch 2/5\n",
      "7662/7662 [==============================] - 2s 236us/step - loss: 0.5770 - acc: 0.8315 - val_loss: 0.6896 - val_acc: 0.7946\n",
      "Epoch 3/5\n",
      "7662/7662 [==============================] - 2s 235us/step - loss: 0.4344 - acc: 0.8656 - val_loss: 0.7040 - val_acc: 0.7911\n",
      "Epoch 4/5\n",
      "7662/7662 [==============================] - 2s 235us/step - loss: 0.3274 - acc: 0.9003 - val_loss: 0.6989 - val_acc: 0.8005\n",
      "Epoch 5/5\n",
      "7662/7662 [==============================] - 2s 235us/step - loss: 0.2458 - acc: 0.9244 - val_loss: 0.7474 - val_acc: 0.7981\n"
     ]
    }
   ],
   "source": [
    "history = model.fit(x_train, y_train,\n",
    "                    batch_size=batch_size,\n",
    "                    epochs=epochs,\n",
    "                    verbose=1,\n",
    "                    validation_split=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2129/2129 [==============================] - 0s 60us/step\n",
      "Test score: 0.7458917609734846\n",
      "Test accuracy: 0.789572569393339\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the accuracy of our trained model\n",
    "score = model.evaluate(x_test, y_test,\n",
    "                       batch_size=batch_size, verbose=1)\n",
    "print('Test score:', score[0])\n",
    "print('Test accuracy:', score[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I received an employment offer from XXXX XXXX cabl ...\n",
      "Actual label:Credit reporting\n",
      "Predicted label: Credit reporting\n",
      "\n",
      "citi bank offered me for a citi XXXX XXXX XXXX XXX ...\n",
      "Actual label:Credit card\n",
      "Predicted label: Credit card\n",
      "\n",
      "This company made inquiries to my credit report. I ...\n",
      "Actual label:Consumer Loan\n",
      "Predicted label: Credit reporting\n",
      "\n",
      "I was put on an account with XXXX college. I was s ...\n",
      "Actual label:Debt collection\n",
      "Predicted label: Debt collection\n",
      "\n",
      "I can not get my mortgage company to send me a bil ...\n",
      "Actual label:Mortgage\n",
      "Predicted label: Mortgage\n",
      "\n",
      "I started receiving calls from Amsher XX/XX/2016 i ...\n",
      "Actual label:Debt collection\n",
      "Predicted label: Debt collection\n",
      "\n",
      "Equifax is ignoring my numerous requests to invest ...\n",
      "Actual label:Credit reporting\n",
      "Predicted label: Credit reporting\n",
      "\n",
      "The personal identities of my wife and I, which we ...\n",
      "Actual label:Debt collection\n",
      "Predicted label: Bank account or service\n",
      "\n",
      "The company XXXX wanted to hire me to purchase pro ...\n",
      "Actual label:Credit card\n",
      "Predicted label: Credit card\n",
      "\n",
      "TeleCheck doing business as TRS RECOVERY SERVICES, ...\n",
      "Actual label:Debt collection\n",
      "Predicted label: Bank account or service\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Here's how to generate a prediction on individual examples\n",
    "text_labels = encoder.classes_ \n",
    "\n",
    "for i in range(10):\n",
    "    prediction = model.predict(np.array([x_test[i]]))\n",
    "    predicted_label = text_labels[np.argmax(prediction)]\n",
    "    print(test_narrative.iloc[i][:50], \"...\")\n",
    "    print('Actual label:' + test_product.iloc[i])\n",
    "    print(\"Predicted label: \" + predicted_label + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_tensorflow_p36",
   "language": "python",
   "name": "conda_tensorflow_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
